library(tidyverse)
library(plotly)
# The file writes missing values as the literal token "null". Declaring it as
# an NA token lets numeric columns parse as numbers; the parsing failures
# echoed below were produced with readr's default NA handling.
homeruns <- read_csv("2019_homeruns_batters.csv", na = c("", "NA", "null"))
## Warning: Duplicated column names deduplicated: 'pitcher' =>
## 'pitcher_1' [60], 'fielder_2' => 'fielder_2_1' [61]
## Warning: 1323 parsing failures.
## row col expected actual file
## 2851 release_speed a double null '2019_homeruns_batters.csv'
## 2851 release_pos_x a double null '2019_homeruns_batters.csv'
## 2851 release_pos_z a double null '2019_homeruns_batters.csv'
## 2851 zone a double null '2019_homeruns_batters.csv'
## 2851 pfx_x a double null '2019_homeruns_batters.csv'
## .... ............. ........ ...... ...........................
## See problems(...) for more details.
# No parsing failures were echoed for this file, but "null" is declared as an
# NA token anyway (presumably the same export format as the other hit files)
# so that its columns are typed consistently before the files are combined.
triples <- read_csv("2019_triples_batters.csv", na = c("", "NA", "null"))
## Warning: Duplicated column names deduplicated: 'pitcher' =>
## 'pitcher_1' [60], 'fielder_2' => 'fielder_2_1' [61]
# The file writes missing values as the literal token "null". Declaring it as
# an NA token lets numeric columns parse as numbers; the parsing failures
# echoed below were produced with readr's default NA handling.
doubles <- read_csv("2019_doubles_batters.csv", na = c("", "NA", "null"))
## Warning: Duplicated column names deduplicated: 'pitcher' =>
## 'pitcher_1' [60], 'fielder_2' => 'fielder_2_1' [61]
## Warning: 1618 parsing failures.
## row col expected actual file
## 1057 hc_x a double null '2019_doubles_batters.csv'
## 1057 hc_y a double null '2019_doubles_batters.csv'
## 1227 hit_distance_sc a double null '2019_doubles_batters.csv'
## 1448 hit_distance_sc a double null '2019_doubles_batters.csv'
## 3672 hit_distance_sc a double null '2019_doubles_batters.csv'
## .... ............... ........ ...... ..........................
## See problems(...) for more details.
# The file writes missing values as the literal token "null". Declaring it as
# an NA token lets numeric columns parse as numbers; the parsing failures
# echoed below were produced with readr's default NA handling.
singles <- read_csv("2019_singles_batters.csv", na = c("", "NA", "null"))
## Warning: Duplicated column names deduplicated: 'pitcher' =>
## 'pitcher_1' [60], 'fielder_2' => 'fielder_2_1' [61]
## Warning: 5095 parsing failures.
## row col expected actual file
## 4645 hit_distance_sc a double null '2019_singles_batters.csv'
## 4894 hit_distance_sc a double null '2019_singles_batters.csv'
## 10890 hit_distance_sc a double null '2019_singles_batters.csv'
## 12882 release_speed a double null '2019_singles_batters.csv'
## 12882 release_pos_x a double null '2019_singles_batters.csv'
## ..... ............... ........ ...... ..........................
## See problems(...) for more details.
It is possible to download a CSV from the website that includes all hits from a season. However, the site caps the number of rows in a CSV at 40,000, which is 2,307 fewer than the number of hits in 2019. In order to be as accurate as possible, I had to download CSVs of each type of hit separately, then combine them using rbind().
# Stack the four per-hit-type tables (home runs, triples, doubles, singles)
# into one table of every hit, in that order.
all_hits <- do.call(rbind, list(homeruns, triples, doubles, singles))
I increased the transparency of the points to show where most of them cluster. According to this visualization, the highest concentration of home runs was hit between 100 and 105 miles per hour and at a 25-30 degree launch angle.
# Home runs only: launch_speed (x) against launch_angle (y). The low alpha
# makes overlapping points accumulate into visibly darker clusters.
homeruns %>%
  ggplot(mapping = aes(x = launch_speed, y = launch_angle)) +
  geom_point(alpha = 0.1)

# Same scatter, with each point coloured by the release_speed column.
homeruns %>%
  ggplot(
    mapping = aes(x = launch_speed, y = launch_angle, colour = release_speed)
  ) +
  geom_point(alpha = 0.2)
There are over 40k points, so I had to increase the transparency even more to show where the concentration of hits lies.
# Every hit: launch_speed (x) against launch_angle (y), drawn with very
# transparent points so dense regions stand out.
all_hits %>%
  ggplot(aes(launch_speed, launch_angle)) +
  geom_point(alpha = 0.03)
Unsurprisingly, ground balls have a low launch angle (most below 0 degrees) and a wide spread of speeds that resulted in hits. Fly balls have a high launch angle. The gap between the two clusters of fly ball hits between 75 and 80 mph is unexplained and requires more research.
# Every hit again, now coloured by bb_type and with the point shape taken
# from the events column.
all_hits %>%
  ggplot(
    aes(x = launch_speed, y = launch_angle, colour = bb_type, shape = events)
  ) +
  geom_point(alpha = 0.4)
There is another field in the data set called “des” that is an in-depth description of everything that happened during the play. I added that field to the hover tooltip.
# Interactive version of the all-hits scatter, coloured by events. The `text`
# aesthetic carries the des column, and tooltip = "text" restricts the hover
# box to that aesthetic.
ggplotly(
  p = all_hits %>%
    ggplot(
      aes(x = launch_speed, y = launch_angle, colour = events, text = des)
    ) +
    geom_point(alpha = 0.1),
  tooltip = "text"
)
# All hits coloured by events, split into one panel per pitch_name value.
all_hits %>%
  ggplot(aes(x = launch_speed, y = launch_angle, colour = events)) +
  geom_point(alpha = 0.1) +
  facet_wrap(facets = vars(pitch_name))
# hit_distance_sc and release_speed should be numeric but were read in as
# character, so coerce both to integer. Values that cannot be converted become
# NA, which is what triggers the coercion warnings echoed below.
all_hits_2019 <- all_hits %>%
  mutate(
    hit_distance_sc = as.integer(hit_distance_sc),
    release_speed = as.integer(release_speed)
  )
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
# Hit distance (x) against launch angle (y), with launch_speed mapped to a
# blue-to-red colour gradient.
all_hits_2019 %>%
  ggplot(aes(x = hit_distance_sc, y = launch_angle, colour = launch_speed)) +
  geom_point(alpha = 0.05) +
  scale_colour_gradient(low = "blue", high = "red")
## Warning: Removed 1320 rows containing missing values (geom_point).
# Hit distance (x) against release speed (y), coloured by launch_speed on a
# blue-to-red gradient, with one panel per events value.
all_hits_2019 %>%
  ggplot(aes(x = hit_distance_sc, y = release_speed, colour = launch_speed)) +
  geom_point(alpha = 0.1) +
  scale_colour_gradient(low = "blue", high = "red") +
  facet_wrap(facets = vars(events))
## Warning: Removed 1320 rows containing missing values (geom_point).
# Interactive, per-events panels of hit distance (x) against launch angle
# (y), coloured by launch_speed on a blue-to-red gradient.
ggplotly(
  p = all_hits_2019 %>%
    ggplot(aes(x = hit_distance_sc, y = launch_angle, colour = launch_speed)) +
    geom_point(alpha = 0.05) +
    facet_wrap(facets = vars(events)) +
    scale_colour_gradient(low = "blue", high = "red")
)
# Keep only the rows whose player_name is "Anthony Rendon".
rendon <- all_hits %>%
  filter(player_name == "Anthony Rendon")

# Interactive scatter of those hits, coloured by events; the hover tooltip is
# limited to the `text` aesthetic, which carries the des column.
ggplotly(
  p = rendon %>%
    ggplot(
      aes(x = launch_speed, y = launch_angle, colour = events, text = des)
    ) +
    geom_point(alpha = 0.5),
  tooltip = "text"
)
# Packages used by the Shiny app below. Each call sits on its own line because
# several statements on one line without a separator do not parse in R.
library(tidyverse)
library(shiny)
library(plotly)
# App layout: a sidebar with a free-text player name and a sample-size slider
# (upper bound = number of rows in all_hits), next to a main panel holding the
# plotly output "plot1".
ui <- fluidPage(
  # Plain ASCII quotes: typographic quotes are not string delimiters in R.
  titlePanel("MLB Hits 2019"),
  sidebarLayout(
    sidebarPanel(
      textInput(
        inputId = "name",
        label = "Player Name:",
        value = "enter a player name"
      ),
      sliderInput(
        "sampleSize", "Sample Size",
        min = 1, max = nrow(all_hits),
        value = 1000, step = 500, round = 0
      )
    ),
    mainPanel(
      plotlyOutput(outputId = "plot1")
    )
  )
)
# Server logic for the app.
#
# input$sampleSize  number of rows to draw at random from all_hits.
# input$name        free text; when it exactly matches a player_name in
#                   all_hits, the sampled rows are narrowed to that player,
#                   otherwise the whole sample is plotted.
# output$plot1      interactive launch_speed vs. launch_angle scatter coloured
#                   by events, with the des column as the hover tooltip.
server <- function(input, output) {
  # Random subset of all_hits; re-drawn whenever the slider value changes.
  dataset <- reactive({
    all_hits[sample(nrow(all_hits), input$sampleSize), ]
  })

  output$plot1 <- renderPlotly({
    pd <- dataset()
    if (input$name %in% unique(all_hits$player_name)) {
      pd <- filter(pd, player_name == input$name)
    }

    # The name is checked against all_hits but the filter runs on the random
    # sample, so a valid player can end up with zero rows. Show a message in
    # place of the plot rather than handing an empty data frame to the
    # plotting code.
    validate(
      need(
        nrow(pd) > 0,
        "No hits for this player in the current sample; increase the sample size."
      )
    )

    hit_plot <- ggplot(
      pd,
      aes(x = launch_speed, y = launch_angle, color = events, text = des)
    ) +
      geom_point(alpha = 0.4)

    # Convert explicitly so the hover box shows only the play description,
    # matching the other interactive plots in this file.
    ggplotly(hit_plot, tooltip = "text")
  })
}
# Launch the app with the UI and server defined above.
shinyApp(ui = ui, server = server)